In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import lightgbm as lgb
import gc

In [2]:
import gensim
import pickle

In [3]:
from sklearn.model_selection import train_test_split

Load Data


In [4]:
IDIR = '../data/raw/'
print('loading prior')
priors = pd.read_csv(IDIR + 'order_products__prior.csv', dtype={
            'order_id': np.int32,
            'product_id': np.uint16,
            'add_to_cart_order': np.int16,
            'reordered': np.int8})

print('loading train')
train = pd.read_csv(IDIR + 'order_products__train.csv', dtype={
            'order_id': np.int32,
            'product_id': np.uint16,
            'add_to_cart_order': np.int16,
            'reordered': np.int8})

print('loading orders')
orders = pd.read_csv(IDIR + 'orders.csv', dtype={
        'order_id': np.int32,
        'user_id': np.int32,
        'order_number': np.int16,
        'order_dow': np.int8,
        'order_hour_of_day': np.int8,
        'days_since_prior_order': np.float32})

print('loading products')
products = pd.read_csv(IDIR + 'products.csv', dtype={
        'product_id': np.uint16,
        'order_id': np.int32,
        'aisle_id': np.uint8,
        'department_id': np.uint8},
        usecols=['product_id', 'aisle_id', 'department_id'])

print('priors {}: {}'.format(priors.shape, ', '.join(priors.columns)))
print('orders {}: {}'.format(orders.shape, ', '.join(orders.columns)))
print('train {}: {}'.format(train.shape, ', '.join(train.columns)))
print('train {}: {}'.format(train.shape, ', '.join(products.columns)))


loading prior
loading train
loading orders
loading products
priors (32434489, 4): order_id, product_id, add_to_cart_order, reordered
orders (3421083, 7): order_id, user_id, eval_set, order_number, order_dow, order_hour_of_day, days_since_prior_order
train (1384617, 4): order_id, product_id, add_to_cart_order, reordered
train (1384617, 4): product_id, aisle_id, department_id

In [5]:
print('computing product f')
prods = pd.DataFrame()
prods['orders'] = priors.groupby(priors.product_id).size().astype(np.int32)
prods['reorders'] = priors['reordered'].groupby(priors.product_id).sum().astype(np.float32)
prods['reorder_rate'] = (prods.reorders / prods.orders).astype(np.float32)
products = products.join(prods, on='product_id')
products.set_index('product_id', drop=False, inplace=True)
del prods
gc.collect()


computing product f
Out[5]:
62

In [6]:
print('add order info to priors')
orders.set_index('order_id', inplace=True, drop=False)
priors = priors.join(orders, on='order_id', rsuffix='_')
priors.drop('order_id_', inplace=True, axis=1)


add order info to priors

In [7]:
### user features
print('computing user f')
usr = pd.DataFrame()
usr['average_days_between_orders'] = orders[orders.eval_set == "prior"].groupby('user_id')['days_since_prior_order'].mean().astype(np.float32)
usr['nb_orders'] = orders[orders.eval_set == "prior"].groupby('user_id').size().astype(np.int16)

users = pd.DataFrame()
users['total_items'] = priors.groupby('user_id').size().astype(np.int16)
users['all_products'] = priors.groupby('user_id')['product_id'].apply(set)
users['total_distinct_items'] = (users.all_products.map(len)).astype(np.int16)

users = users.join(usr)
users['average_basket'] = (users.total_items / users.nb_orders).astype(np.float32)
print('user f', users.shape)

del usr, priors
gc.collect()


computing user f
('user f', (206209, 6))
Out[7]:
57

In [35]:
### build list of candidate products to reorder, with features ###
def features(selected_orders, labels_given=False):
    print('build candidate list')
    order_list = []
    product_list = []
    labels = []
    i=0
    for row in selected_orders.itertuples():
        i+=1
        if i%10000 == 0: print('order row',i)
        order_id = row.order_id
        user_id = row.user_id
        user_products = users.all_products[user_id]
        product_list += user_products
        order_list += [order_id] * len(user_products)
        if labels_given:
            labels += [(order_id, product) in train.index for product in user_products]
        
    df = pd.DataFrame({'order_id':order_list, 'product_id':product_list}, dtype=np.int32)
    labels = np.array(labels, dtype=np.int8)
    del order_list
    del product_list
    
    print('user related features')
    df['user_id'] = df.order_id.map(orders.user_id)
    df['user_total_orders'] = df.user_id.map(users.nb_orders)
    df['user_total_items'] = df.user_id.map(users.total_items)
    df['total_distinct_items'] = df.user_id.map(users.total_distinct_items)
    df['user_average_days_between_orders'] = df.user_id.map(users.average_days_between_orders)
    df['user_average_basket'] =  df.user_id.map(users.average_basket)
    
    print('order related features')
    df['dow'] = df.order_id.map(orders.order_dow)
    df['order_hour_of_day'] = df.order_id.map(orders.order_hour_of_day)
    df['days_since_prior_order'] = df.order_id.map(orders.days_since_prior_order)
    df['days_since_ratio'] = df.days_since_prior_order / df.user_average_days_between_orders
    
    print('product related features')
    df['aisle_id'] = df.product_id.map(products.aisle_id)
    df['department_id'] = df.product_id.map(products.department_id)
    df['product_orders'] = df.product_id.map(products.orders).astype(np.int32)
    df['product_reorders'] = df.product_id.map(products.reorders)
    df['product_reorder_rate'] = df.product_id.map(products.reorder_rate)

    print(df.dtypes)
    print(df.memory_usage())
    return (df, labels)

In [32]:
### train / test orders ###
print('split orders : train, test')
train_orders = orders[orders.eval_set == 'train']
test_orders = orders[orders.eval_set == 'test']

train.set_index(['order_id', 'product_id'], inplace=True, drop=False)


split orders : train, test

In [10]:
df_train, labels = features(train_orders, labels_given=True)


build candidate list
('order row', 10000)
('order row', 20000)
('order row', 30000)
('order row', 40000)
('order row', 50000)
('order row', 60000)
('order row', 70000)
('order row', 80000)
('order row', 90000)
('order row', 100000)
('order row', 110000)
('order row', 120000)
('order row', 130000)
user related features
order related features
product related features
order_id                              int32
product_id                            int32
user_id                               int32
user_total_orders                     int16
user_total_items                      int16
total_distinct_items                  int16
user_average_days_between_orders    float32
user_average_basket                 float32
dow                                    int8
order_hour_of_day                      int8
days_since_prior_order              float32
days_since_ratio                    float32
aisle_id                              uint8
department_id                         uint8
product_orders                        int32
product_reorders                    float32
product_reorder_rate                float32
dtype: object
Index                                     72
order_id                            33898644
product_id                          33898644
user_id                             33898644
user_total_orders                   16949322
user_total_items                    16949322
total_distinct_items                16949322
user_average_days_between_orders    33898644
user_average_basket                 33898644
dow                                  8474661
order_hour_of_day                    8474661
days_since_prior_order              33898644
days_since_ratio                    33898644
aisle_id                             8474661
department_id                        8474661
product_orders                      33898644
product_reorders                    33898644
product_reorder_rate                33898644
dtype: int64

In [11]:
del train_orders
gc.collect()


Out[11]:
92

In [12]:
df_train.shape


Out[12]:
(8474661, 17)

In [13]:
df_train.head()


Out[13]:
order_id product_id user_id user_total_orders user_total_items total_distinct_items user_average_days_between_orders user_average_basket dow order_hour_of_day days_since_prior_order days_since_ratio aisle_id department_id product_orders product_reorders product_reorder_rate
0 1187899 17122 1 10 59 18 19.555555 5.9 4 8 14.0 0.715909 24 4 13880 9377.0 0.675576
1 1187899 196 1 10 59 18 19.555555 5.9 4 8 14.0 0.715909 77 7 35791 27791.0 0.776480
2 1187899 26405 1 10 59 18 19.555555 5.9 4 8 14.0 0.715909 54 17 1214 536.0 0.441516
3 1187899 13032 1 10 59 18 19.555555 5.9 4 8 14.0 0.715909 121 14 3751 2465.0 0.657158
4 1187899 39657 1 10 59 18 19.555555 5.9 4 8 14.0 0.715909 45 19 5019 3846.0 0.766288

Load other pre-calculated user features and product embedding features


In [12]:
# all user feature(others) 
all_users_features_df = pd.read_pickle("../data/processed/cleaned_all_users_features.pickle")

In [14]:
cols = all_users_features_df.columns
dow_cols = cols[cols.str.startswith('dow_')].tolist() + cols[cols.str.startswith('daytime_')].tolist()
most_cols = cols[cols.str.startswith('most_')].tolist()
top_cols = cols[cols.str.startswith('top')].tolist()
emb_cols = cols[cols.str.startswith('emb_')].tolist()

In [15]:
print("join with the user features")
to_join = ["user_id", 'user_avg_reordered', 'user_perc_reordered'] + most_cols + dow_cols + emb_cols + top_cols
df_train = pd.merge(df_train, all_users_features_df[to_join], on ="user_id")


join with the user features

In [16]:
gc.collect()


Out[16]:
21

In [17]:
df_train.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 8474661 entries, 0 to 8474660
Columns: 137 entries, order_id to top3_reordered_pid
dtypes: float32(108), int16(3), int32(7), int8(2), uint8(17)
memory usage: 3.9 GB

In [15]:
#product_emd = gensim.models.Word2Vec.load("../data/interim/product2vec.model")

In [16]:
#product_emd_dict = {k: product_emd[k]  for k in product_emd.wv.vocab.keys()}

In [17]:
# product_emd_df = np.round(pd.DataFrame.from_dict(product_emd_dict, orient='index', dtype = np.float32),2).\
#                     add_prefix('prod2vec_').reset_index().\
#                     rename(columns = {'index': 'product_id'})

In [23]:
product_emd_df.head()


Out[23]:
product_id prod2vec_0 prod2vec_1 prod2vec_2 prod2vec_3 prod2vec_4 prod2vec_5 prod2vec_6 prod2vec_7 prod2vec_8 ... prod2vec_90 prod2vec_91 prod2vec_92 prod2vec_93 prod2vec_94 prod2vec_95 prod2vec_96 prod2vec_97 prod2vec_98 prod2vec_99
0 13357 -0.00 -0.22 0.19 -0.32 0.16 -0.17 -0.24 0.11 0.19 ... 0.13 -0.13 -0.27 0.15 -0.43 0.07 -0.15 0.28 0.02 0.12
1 11542 0.12 0.29 -0.02 -0.07 -0.12 -0.02 -0.22 0.17 0.09 ... -0.21 0.07 -0.19 -0.02 -0.18 0.20 -0.13 -0.02 -0.26 0.18
2 11543 0.77 -0.30 0.90 -0.74 0.47 0.93 0.88 -0.38 0.26 ... -1.75 0.18 -1.21 -1.28 -0.60 0.36 1.37 1.58 -1.18 0.82
3 11540 0.77 -0.38 -0.62 -0.80 0.26 -1.03 -0.77 0.50 -0.51 ... -0.47 -0.02 -0.26 1.60 0.46 0.93 -0.12 -0.64 0.43 -0.51
4 11541 -0.23 -0.08 0.20 -0.07 0.04 -0.36 0.01 0.34 -0.00 ... -0.15 -0.08 0.02 0.37 0.18 -0.37 -0.41 -0.01 0.07 0.38

5 rows × 101 columns


In [22]:
#prod2vec_cols = product_emd_df.columns[product_emd_df.columns.str.startswith('prod2vec')]

In [23]:
# for col in prod2vec_cols:
#     product_emd_df[col] = product_emd_df[col].astype('float32')

In [32]:
#product_emd_df.to_pickle("../data/interim/prod2vec_df.pickle")
product_emd_df = pd.read_pickle("../data/interim/prod2vec_df.pickle")

In [33]:
product_emd_df.head()


Out[33]:
product_id prod2vec_0 prod2vec_1 prod2vec_2 prod2vec_3 prod2vec_4 prod2vec_5 prod2vec_6 prod2vec_7 prod2vec_8 ... prod2vec_90 prod2vec_91 prod2vec_92 prod2vec_93 prod2vec_94 prod2vec_95 prod2vec_96 prod2vec_97 prod2vec_98 prod2vec_99
0 13357 -0.00 -0.22 0.19 -0.32 0.16 -0.17 -0.24 0.11 0.19 ... 0.13 -0.13 -0.27 0.15 -0.43 0.07 -0.15 0.28 0.02 0.12
1 11542 0.12 0.29 -0.02 -0.07 -0.12 -0.02 -0.22 0.17 0.09 ... -0.21 0.07 -0.19 -0.02 -0.18 0.20 -0.13 -0.02 -0.26 0.18
2 11543 0.77 -0.30 0.90 -0.74 0.47 0.93 0.88 -0.38 0.26 ... -1.75 0.18 -1.21 -1.28 -0.60 0.36 1.37 1.58 -1.18 0.82
3 11540 0.77 -0.38 -0.62 -0.80 0.26 -1.03 -0.77 0.50 -0.51 ... -0.47 -0.02 -0.26 1.60 0.46 0.93 -0.12 -0.64 0.43 -0.51
4 11541 -0.23 -0.08 0.20 -0.07 0.04 -0.36 0.01 0.34 -0.00 ... -0.15 -0.08 0.02 0.37 0.18 -0.37 -0.41 -0.01 0.07 0.38

5 rows × 101 columns


In [39]:
product_emd_df['product_id'] = product_emd_df['product_id'].astype('int32')

In [42]:
print("joint product embedding")
df_train = pd.merge(df_train, product_emd_df, on = "product_id", how = "left")


joint product embedding

In [40]:
#df_train.drop(prod2vec_cols, axis = 1, inplace=True)

In [44]:
df_train.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 8474661 entries, 0 to 8474660
Columns: 134 entries, order_id to prod2vec_99
dtypes: float32(108), int16(3), int32(3), int8(2), object(1), uint8(17)
memory usage: 3.8+ GB

In [18]:
del all_users_features_df#, product_emd_df
gc.collect()


Out[18]:
24

In [47]:
df_train[prod2vec_cols] = df_train[prod2vec_cols].fillna(df_train[prod2vec_cols].mean())

In [48]:
df_train.shape


Out[48]:
(8474661, 134)

Train LightGBM

with less features


In [19]:
features = df_train.columns
dow_cols = features[features.str.startswith("dow_")].tolist()
daytime_cols = features[features.str.startswith("daytime_")].tolist()
emb_cols = features[features.str.startswith("emb_")].tolist()
#most_cols = features[features.str.startswith("most_")].tolist()
#top_cols = features[features.str.startswith("top_")].tolist()
#prod2vec_cols = features[features.str.startswith("prod2vec")].tolist()
f_to_use = ['user_total_orders', 'user_total_items', 'total_distinct_items', 'dow',
       'user_average_days_between_orders', 'user_average_basket',
       'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio',
       'aisle_id', 'department_id', 'product_orders', 'product_reorders',
       'product_reorder_rate'] + dow_cols + daytime_cols  + emb_cols

In [26]:
gc.collect()


Out[26]:
7

In [29]:
print("split the train and validation set")
X_train, X_valid, y_train, y_valid = train_test_split(df_train[f_to_use], labels, test_size = 0.3, random_state=2017)


split the train and validation set

In [30]:
X_train = pd.DataFrame(X_train, columns = f_to_use)
X_valid = pd.DataFrame(X_valid, columns = f_to_use)

In [31]:
print('formating training and validation dataset for lgb')
d_train = lgb.Dataset(X_train,
                      label=y_train,
                      categorical_feature=['aisle_id', 'department_id'])  
d_valid = lgb.Dataset(X_valid,
                      label=y_valid,
                      categorical_feature=['aisle_id', 'department_id'])


formating training and validation dataset for lgb

In [32]:
#del df_train
gc.collect()


Out[32]:
19

In [33]:
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 80,
    'max_depth': 10,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.9,
    'bagging_freq': 8
}
ROUNDS = 300

In [34]:
print('Training light GBM ...')
bst = lgb.train(params, d_train, valid_sets= [d_valid], num_boost_round= ROUNDS, early_stopping_rounds = 10)


Training light GBM ...
[1]	valid_0's binary_logloss: 0.629905
Train until valid scores didn't improve in 10 rounds.
[2]	valid_0's binary_logloss: 0.578155
[3]	valid_0's binary_logloss: 0.535303
[4]	valid_0's binary_logloss: 0.499919
[5]	valid_0's binary_logloss: 0.469775
[6]	valid_0's binary_logloss: 0.444261
[7]	valid_0's binary_logloss: 0.422744
[8]	valid_0's binary_logloss: 0.404203
[9]	valid_0's binary_logloss: 0.388396
[10]	valid_0's binary_logloss: 0.374922
[11]	valid_0's binary_logloss: 0.363287
[12]	valid_0's binary_logloss: 0.353133
[13]	valid_0's binary_logloss: 0.344392
[14]	valid_0's binary_logloss: 0.33689
[15]	valid_0's binary_logloss: 0.33057
[16]	valid_0's binary_logloss: 0.325304
[17]	valid_0's binary_logloss: 0.320519
[18]	valid_0's binary_logloss: 0.316307
[19]	valid_0's binary_logloss: 0.312849
[20]	valid_0's binary_logloss: 0.309789
[21]	valid_0's binary_logloss: 0.307106
[22]	valid_0's binary_logloss: 0.304708
[23]	valid_0's binary_logloss: 0.302734
[24]	valid_0's binary_logloss: 0.301035
[25]	valid_0's binary_logloss: 0.299465
[26]	valid_0's binary_logloss: 0.298184
[27]	valid_0's binary_logloss: 0.297019
[28]	valid_0's binary_logloss: 0.296073
[29]	valid_0's binary_logloss: 0.29515
[30]	valid_0's binary_logloss: 0.294345
[31]	valid_0's binary_logloss: 0.293653
[32]	valid_0's binary_logloss: 0.293061
[33]	valid_0's binary_logloss: 0.292538
[34]	valid_0's binary_logloss: 0.292053
[35]	valid_0's binary_logloss: 0.291642
[36]	valid_0's binary_logloss: 0.291278
[37]	valid_0's binary_logloss: 0.290982
[38]	valid_0's binary_logloss: 0.290674
[39]	valid_0's binary_logloss: 0.290398
[40]	valid_0's binary_logloss: 0.290161
[41]	valid_0's binary_logloss: 0.289955
[42]	valid_0's binary_logloss: 0.289783
[43]	valid_0's binary_logloss: 0.289583
[44]	valid_0's binary_logloss: 0.289445
[45]	valid_0's binary_logloss: 0.289304
[46]	valid_0's binary_logloss: 0.289127
[47]	valid_0's binary_logloss: 0.289007
[48]	valid_0's binary_logloss: 0.288885
[49]	valid_0's binary_logloss: 0.28875
[50]	valid_0's binary_logloss: 0.288639
[51]	valid_0's binary_logloss: 0.288534
[52]	valid_0's binary_logloss: 0.288444
[53]	valid_0's binary_logloss: 0.288353
[54]	valid_0's binary_logloss: 0.288262
[55]	valid_0's binary_logloss: 0.288177
[56]	valid_0's binary_logloss: 0.288107
[57]	valid_0's binary_logloss: 0.288022
[58]	valid_0's binary_logloss: 0.287923
[59]	valid_0's binary_logloss: 0.287859
[60]	valid_0's binary_logloss: 0.287779
[61]	valid_0's binary_logloss: 0.287707
[62]	valid_0's binary_logloss: 0.287628
[63]	valid_0's binary_logloss: 0.287563
[64]	valid_0's binary_logloss: 0.287518
[65]	valid_0's binary_logloss: 0.287465
[66]	valid_0's binary_logloss: 0.287409
[67]	valid_0's binary_logloss: 0.287356
[68]	valid_0's binary_logloss: 0.287303
[69]	valid_0's binary_logloss: 0.287244
[70]	valid_0's binary_logloss: 0.287201
[71]	valid_0's binary_logloss: 0.287153
[72]	valid_0's binary_logloss: 0.287111
[73]	valid_0's binary_logloss: 0.287061
[74]	valid_0's binary_logloss: 0.287015
[75]	valid_0's binary_logloss: 0.286974
[76]	valid_0's binary_logloss: 0.286914
[77]	valid_0's binary_logloss: 0.286877
[78]	valid_0's binary_logloss: 0.286841
[79]	valid_0's binary_logloss: 0.286808
[80]	valid_0's binary_logloss: 0.28677
[81]	valid_0's binary_logloss: 0.286738
[82]	valid_0's binary_logloss: 0.286684
[83]	valid_0's binary_logloss: 0.286652
[84]	valid_0's binary_logloss: 0.286617
[85]	valid_0's binary_logloss: 0.286581
[86]	valid_0's binary_logloss: 0.286546
[87]	valid_0's binary_logloss: 0.286517
[88]	valid_0's binary_logloss: 0.286492
[89]	valid_0's binary_logloss: 0.286467
[90]	valid_0's binary_logloss: 0.286437
[91]	valid_0's binary_logloss: 0.28641
[92]	valid_0's binary_logloss: 0.286382
[93]	valid_0's binary_logloss: 0.286353
[94]	valid_0's binary_logloss: 0.286295
[95]	valid_0's binary_logloss: 0.286267
[96]	valid_0's binary_logloss: 0.286246
[97]	valid_0's binary_logloss: 0.286203
[98]	valid_0's binary_logloss: 0.286176
[99]	valid_0's binary_logloss: 0.286148
[100]	valid_0's binary_logloss: 0.286116
[101]	valid_0's binary_logloss: 0.286092
[102]	valid_0's binary_logloss: 0.28606
[103]	valid_0's binary_logloss: 0.286033
[104]	valid_0's binary_logloss: 0.286006
[105]	valid_0's binary_logloss: 0.285979
[106]	valid_0's binary_logloss: 0.28596
[107]	valid_0's binary_logloss: 0.285936
[108]	valid_0's binary_logloss: 0.285912
[109]	valid_0's binary_logloss: 0.285892
[110]	valid_0's binary_logloss: 0.285869
[111]	valid_0's binary_logloss: 0.285847
[112]	valid_0's binary_logloss: 0.285825
[113]	valid_0's binary_logloss: 0.285801
[114]	valid_0's binary_logloss: 0.28578
[115]	valid_0's binary_logloss: 0.285747
[116]	valid_0's binary_logloss: 0.285726
[117]	valid_0's binary_logloss: 0.285702
[118]	valid_0's binary_logloss: 0.285687
[119]	valid_0's binary_logloss: 0.285652
[120]	valid_0's binary_logloss: 0.285634
[121]	valid_0's binary_logloss: 0.285613
[122]	valid_0's binary_logloss: 0.28559
[123]	valid_0's binary_logloss: 0.285574
[124]	valid_0's binary_logloss: 0.285555
[125]	valid_0's binary_logloss: 0.285539
[126]	valid_0's binary_logloss: 0.285521
[127]	valid_0's binary_logloss: 0.285501
[128]	valid_0's binary_logloss: 0.285489
[129]	valid_0's binary_logloss: 0.28547
[130]	valid_0's binary_logloss: 0.285456
[131]	valid_0's binary_logloss: 0.285439
[132]	valid_0's binary_logloss: 0.28542
[133]	valid_0's binary_logloss: 0.285396
[134]	valid_0's binary_logloss: 0.285382
[135]	valid_0's binary_logloss: 0.285364
[136]	valid_0's binary_logloss: 0.285347
[137]	valid_0's binary_logloss: 0.285333
[138]	valid_0's binary_logloss: 0.285313
[139]	valid_0's binary_logloss: 0.285292
[140]	valid_0's binary_logloss: 0.285282
[141]	valid_0's binary_logloss: 0.285267
[142]	valid_0's binary_logloss: 0.28525
[143]	valid_0's binary_logloss: 0.285231
[144]	valid_0's binary_logloss: 0.285215
[145]	valid_0's binary_logloss: 0.285205
[146]	valid_0's binary_logloss: 0.28519
[147]	valid_0's binary_logloss: 0.285177
[148]	valid_0's binary_logloss: 0.285161
[149]	valid_0's binary_logloss: 0.285147
[150]	valid_0's binary_logloss: 0.285123
[151]	valid_0's binary_logloss: 0.285111
[152]	valid_0's binary_logloss: 0.285098
[153]	valid_0's binary_logloss: 0.285084
[154]	valid_0's binary_logloss: 0.285071
[155]	valid_0's binary_logloss: 0.285065
[156]	valid_0's binary_logloss: 0.285047
[157]	valid_0's binary_logloss: 0.285039
[158]	valid_0's binary_logloss: 0.28503
[159]	valid_0's binary_logloss: 0.285021
[160]	valid_0's binary_logloss: 0.285011
[161]	valid_0's binary_logloss: 0.284993
[162]	valid_0's binary_logloss: 0.284981
[163]	valid_0's binary_logloss: 0.28497
[164]	valid_0's binary_logloss: 0.284954
[165]	valid_0's binary_logloss: 0.284939
[166]	valid_0's binary_logloss: 0.284927
[167]	valid_0's binary_logloss: 0.284919
[168]	valid_0's binary_logloss: 0.284894
[169]	valid_0's binary_logloss: 0.284883
[170]	valid_0's binary_logloss: 0.28487
[171]	valid_0's binary_logloss: 0.28486
[172]	valid_0's binary_logloss: 0.284857
[173]	valid_0's binary_logloss: 0.284844
[174]	valid_0's binary_logloss: 0.284836
[175]	valid_0's binary_logloss: 0.284822
[176]	valid_0's binary_logloss: 0.284818
[177]	valid_0's binary_logloss: 0.284815
[178]	valid_0's binary_logloss: 0.284802
[179]	valid_0's binary_logloss: 0.284785
[180]	valid_0's binary_logloss: 0.284773
[181]	valid_0's binary_logloss: 0.284758
[182]	valid_0's binary_logloss: 0.284749
[183]	valid_0's binary_logloss: 0.284736
[184]	valid_0's binary_logloss: 0.284725
[185]	valid_0's binary_logloss: 0.284712
[186]	valid_0's binary_logloss: 0.284702
[187]	valid_0's binary_logloss: 0.284682
[188]	valid_0's binary_logloss: 0.284668
[189]	valid_0's binary_logloss: 0.284661
[190]	valid_0's binary_logloss: 0.284655
[191]	valid_0's binary_logloss: 0.284644
[192]	valid_0's binary_logloss: 0.284633
[193]	valid_0's binary_logloss: 0.284613
[194]	valid_0's binary_logloss: 0.284601
[195]	valid_0's binary_logloss: 0.28459
[196]	valid_0's binary_logloss: 0.284582
[197]	valid_0's binary_logloss: 0.284579
[198]	valid_0's binary_logloss: 0.284557
[199]	valid_0's binary_logloss: 0.284543
[200]	valid_0's binary_logloss: 0.284529
[201]	valid_0's binary_logloss: 0.284517
[202]	valid_0's binary_logloss: 0.284507
[203]	valid_0's binary_logloss: 0.284496
[204]	valid_0's binary_logloss: 0.284487
[205]	valid_0's binary_logloss: 0.284476
[206]	valid_0's binary_logloss: 0.284456
[207]	valid_0's binary_logloss: 0.284448
[208]	valid_0's binary_logloss: 0.284438
[209]	valid_0's binary_logloss: 0.284434
[210]	valid_0's binary_logloss: 0.284425
[211]	valid_0's binary_logloss: 0.284404
[212]	valid_0's binary_logloss: 0.284396
[213]	valid_0's binary_logloss: 0.284383
[214]	valid_0's binary_logloss: 0.284374
[215]	valid_0's binary_logloss: 0.284362
[216]	valid_0's binary_logloss: 0.284351
[217]	valid_0's binary_logloss: 0.28434
[218]	valid_0's binary_logloss: 0.284331
[219]	valid_0's binary_logloss: 0.284316
[220]	valid_0's binary_logloss: 0.284308
[221]	valid_0's binary_logloss: 0.284298
[222]	valid_0's binary_logloss: 0.284292
[223]	valid_0's binary_logloss: 0.284282
[224]	valid_0's binary_logloss: 0.284273
[225]	valid_0's binary_logloss: 0.28426
[226]	valid_0's binary_logloss: 0.284242
[227]	valid_0's binary_logloss: 0.28423
[228]	valid_0's binary_logloss: 0.284228
[229]	valid_0's binary_logloss: 0.284214
[230]	valid_0's binary_logloss: 0.284199
[231]	valid_0's binary_logloss: 0.284182
[232]	valid_0's binary_logloss: 0.284169
[233]	valid_0's binary_logloss: 0.284155
[234]	valid_0's binary_logloss: 0.284145
[235]	valid_0's binary_logloss: 0.284136
[236]	valid_0's binary_logloss: 0.284132
[237]	valid_0's binary_logloss: 0.284123
[238]	valid_0's binary_logloss: 0.284116
[239]	valid_0's binary_logloss: 0.284106
[240]	valid_0's binary_logloss: 0.284096
[241]	valid_0's binary_logloss: 0.284088
[242]	valid_0's binary_logloss: 0.28408
[243]	valid_0's binary_logloss: 0.284071
[244]	valid_0's binary_logloss: 0.284068
[245]	valid_0's binary_logloss: 0.284057
[246]	valid_0's binary_logloss: 0.284045
[247]	valid_0's binary_logloss: 0.284027
[248]	valid_0's binary_logloss: 0.284021
[249]	valid_0's binary_logloss: 0.284017
[250]	valid_0's binary_logloss: 0.28401
[251]	valid_0's binary_logloss: 0.284
[252]	valid_0's binary_logloss: 0.283993
[253]	valid_0's binary_logloss: 0.283987
[254]	valid_0's binary_logloss: 0.283973
[255]	valid_0's binary_logloss: 0.283968
[256]	valid_0's binary_logloss: 0.283957
[257]	valid_0's binary_logloss: 0.283952
[258]	valid_0's binary_logloss: 0.283946
[259]	valid_0's binary_logloss: 0.28394
[260]	valid_0's binary_logloss: 0.283934
[261]	valid_0's binary_logloss: 0.283928
[262]	valid_0's binary_logloss: 0.283921
[263]	valid_0's binary_logloss: 0.283912
[264]	valid_0's binary_logloss: 0.283904
[265]	valid_0's binary_logloss: 0.283897
[266]	valid_0's binary_logloss: 0.28389
[267]	valid_0's binary_logloss: 0.28388
[268]	valid_0's binary_logloss: 0.283874
[269]	valid_0's binary_logloss: 0.283867
[270]	valid_0's binary_logloss: 0.283861
[271]	valid_0's binary_logloss: 0.28385
[272]	valid_0's binary_logloss: 0.283841
[273]	valid_0's binary_logloss: 0.283831
[274]	valid_0's binary_logloss: 0.283823
[275]	valid_0's binary_logloss: 0.28381
[276]	valid_0's binary_logloss: 0.283797
[277]	valid_0's binary_logloss: 0.283792
[278]	valid_0's binary_logloss: 0.283779
[279]	valid_0's binary_logloss: 0.283775
[280]	valid_0's binary_logloss: 0.283767
[281]	valid_0's binary_logloss: 0.283757
[282]	valid_0's binary_logloss: 0.283752
[283]	valid_0's binary_logloss: 0.283746
[284]	valid_0's binary_logloss: 0.283742
[285]	valid_0's binary_logloss: 0.283741
[286]	valid_0's binary_logloss: 0.283729
[287]	valid_0's binary_logloss: 0.283726
[288]	valid_0's binary_logloss: 0.283723
[289]	valid_0's binary_logloss: 0.283714
[290]	valid_0's binary_logloss: 0.283707
[291]	valid_0's binary_logloss: 0.283697
[292]	valid_0's binary_logloss: 0.28369
[293]	valid_0's binary_logloss: 0.283686
[294]	valid_0's binary_logloss: 0.283677
[295]	valid_0's binary_logloss: 0.283668
[296]	valid_0's binary_logloss: 0.283658
[297]	valid_0's binary_logloss: 0.283647
[298]	valid_0's binary_logloss: 0.283638
[299]	valid_0's binary_logloss: 0.283634
[300]	valid_0's binary_logloss: 0.283627

In [35]:
bst.save_model('../models/lightGBM_80_10.txt', num_iteration=bst.best_iteration)

In [20]:
most_cols = features[features.str.startswith("most_")].tolist()
top_cols = features[features.str.startswith("top")].tolist()
f_to_use = ['user_total_orders', 'user_total_items', 'total_distinct_items', 'dow',
       'user_average_days_between_orders', 'user_average_basket',
       'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio',
       'aisle_id', 'department_id', 'product_orders', 'product_reorders',
       'product_reorder_rate'] + dow_cols + daytime_cols  + emb_cols + most_cols + top_cols
print(f_to_use)


['user_total_orders', 'user_total_items', 'total_distinct_items', 'dow', 'user_average_days_between_orders', 'user_average_basket', 'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio', 'aisle_id', 'department_id', 'product_orders', 'product_reorders', 'product_reorder_rate', u'dow_0', u'dow_1', u'dow_2', u'dow_3', u'dow_4', u'dow_5', u'dow_6', u'daytime_sleeping', u'daytime_morning', u'daytime_noon', u'daytime_afternoon', u'daytime_evening', u'daytime_night', u'emb_0', u'emb_1', u'emb_2', u'emb_3', u'emb_4', u'emb_5', u'emb_6', u'emb_7', u'emb_8', u'emb_9', u'emb_10', u'emb_11', u'emb_12', u'emb_13', u'emb_14', u'emb_15', u'emb_16', u'emb_17', u'emb_18', u'emb_19', u'emb_20', u'emb_21', u'emb_22', u'emb_23', u'emb_24', u'emb_25', u'emb_26', u'emb_27', u'emb_28', u'emb_29', u'emb_30', u'emb_31', u'emb_32', u'emb_33', u'emb_34', u'emb_35', u'emb_36', u'emb_37', u'emb_38', u'emb_39', u'emb_40', u'emb_41', u'emb_42', u'emb_43', u'emb_44', u'emb_45', u'emb_46', u'emb_47', u'emb_48', u'emb_49', u'emb_50', u'emb_51', u'emb_52', u'emb_53', u'emb_54', u'emb_55', u'emb_56', u'emb_57', u'emb_58', u'emb_59', u'emb_60', u'emb_61', u'emb_62', u'emb_63', u'emb_64', u'emb_65', u'emb_66', u'emb_67', u'emb_68', u'emb_69', u'emb_70', u'emb_71', u'emb_72', u'emb_73', u'emb_74', u'emb_75', u'emb_76', u'emb_77', u'emb_78', u'emb_79', u'emb_80', u'emb_81', u'emb_82', u'emb_83', u'emb_84', u'emb_85', u'emb_86', u'emb_87', u'emb_88', u'emb_89', u'emb_90', u'emb_91', u'emb_92', u'emb_93', u'emb_94', u'emb_95', u'emb_96', u'emb_97', u'emb_98', u'emb_99', u'most_reordered_aiesle', u'most_reordered_dpmt', u'top1_reordered_pid', u'top2_reordered_pid', u'top3_reordered_pid']

In [37]:
del d_train, d_valid
gc.collect()


Out[37]:
63

In [21]:
print("split the train and validation set")
X_train, X_valid, y_train, y_valid = train_test_split(df_train[f_to_use], labels, test_size = 0.3, random_state=2017)


split the train and validation set

In [22]:
X_train = pd.DataFrame(X_train, columns = f_to_use)
X_valid = pd.DataFrame(X_valid, columns = f_to_use)

In [23]:
print('formating training and validation dataset for lgb')
d_train = lgb.Dataset(X_train,
                      label=y_train,
                      categorical_feature=['aisle_id', 'department_id', 'most_reordered_aiesle',
                                           'most_reordered_dpmt','top1_reordered_pid', 'top2_reordered_pid', 
                                           'top3_reordered_pid' ])  
d_valid = lgb.Dataset(X_valid,
                      label=y_valid,
                      categorical_feature=['aisle_id', 'department_id', 'most_reordered_aiesle', 'most_reordered_dpmt',
                                           'top1_reordered_pid', 'top2_reordered_pid', 
                                           'top3_reordered_pid' ])


formating training and validation dataset for lgb

In [24]:
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 80,
    'max_depth': 10,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.9,
    'bagging_freq': 8
}
ROUNDS = 500

In [25]:
print('Training light GBM ...')
bst = lgb.train(params, d_train, valid_sets= [d_valid], num_boost_round= ROUNDS, early_stopping_rounds = 10)


Training light GBM ...
[1]	valid_0's binary_logloss: 0.629905
Train until valid scores didn't improve in 10 rounds.
[2]	valid_0's binary_logloss: 0.57822
[3]	valid_0's binary_logloss: 0.535383
[4]	valid_0's binary_logloss: 0.499574
[5]	valid_0's binary_logloss: 0.469468
[6]	valid_0's binary_logloss: 0.443913
[7]	valid_0's binary_logloss: 0.422222
[8]	valid_0's binary_logloss: 0.403857
[9]	valid_0's binary_logloss: 0.387963
[10]	valid_0's binary_logloss: 0.374341
[11]	valid_0's binary_logloss: 0.362801
[12]	valid_0's binary_logloss: 0.35273
[13]	valid_0's binary_logloss: 0.344388
[14]	valid_0's binary_logloss: 0.336879
[15]	valid_0's binary_logloss: 0.3306
[16]	valid_0's binary_logloss: 0.324991
[17]	valid_0's binary_logloss: 0.320155
[18]	valid_0's binary_logloss: 0.316072
[19]	valid_0's binary_logloss: 0.31262
[20]	valid_0's binary_logloss: 0.309473
[21]	valid_0's binary_logloss: 0.306745
[22]	valid_0's binary_logloss: 0.304406
[23]	valid_0's binary_logloss: 0.302421
[24]	valid_0's binary_logloss: 0.300715
[25]	valid_0's binary_logloss: 0.299175
[26]	valid_0's binary_logloss: 0.297889
[27]	valid_0's binary_logloss: 0.296738
[28]	valid_0's binary_logloss: 0.295749
[29]	valid_0's binary_logloss: 0.295034
[30]	valid_0's binary_logloss: 0.294258
[31]	valid_0's binary_logloss: 0.293566
[32]	valid_0's binary_logloss: 0.292972
[33]	valid_0's binary_logloss: 0.292452
[34]	valid_0's binary_logloss: 0.292
[35]	valid_0's binary_logloss: 0.291595
[36]	valid_0's binary_logloss: 0.291231
[37]	valid_0's binary_logloss: 0.290943
[38]	valid_0's binary_logloss: 0.290634
[39]	valid_0's binary_logloss: 0.290386
[40]	valid_0's binary_logloss: 0.290164
[41]	valid_0's binary_logloss: 0.289945
[42]	valid_0's binary_logloss: 0.289738
[43]	valid_0's binary_logloss: 0.289567
[44]	valid_0's binary_logloss: 0.2894
[45]	valid_0's binary_logloss: 0.289225
[46]	valid_0's binary_logloss: 0.289104
[47]	valid_0's binary_logloss: 0.288988
[48]	valid_0's binary_logloss: 0.288886
[49]	valid_0's binary_logloss: 0.288785
[50]	valid_0's binary_logloss: 0.288653
[51]	valid_0's binary_logloss: 0.288558
[52]	valid_0's binary_logloss: 0.288464
[53]	valid_0's binary_logloss: 0.288376
[54]	valid_0's binary_logloss: 0.288272
[55]	valid_0's binary_logloss: 0.288182
[56]	valid_0's binary_logloss: 0.288115
[57]	valid_0's binary_logloss: 0.288027
[58]	valid_0's binary_logloss: 0.287951
[59]	valid_0's binary_logloss: 0.287876
[60]	valid_0's binary_logloss: 0.28781
[61]	valid_0's binary_logloss: 0.287755
[62]	valid_0's binary_logloss: 0.287684
[63]	valid_0's binary_logloss: 0.287617
[64]	valid_0's binary_logloss: 0.287567
[65]	valid_0's binary_logloss: 0.287485
[66]	valid_0's binary_logloss: 0.287429
[67]	valid_0's binary_logloss: 0.28737
[68]	valid_0's binary_logloss: 0.287296
[69]	valid_0's binary_logloss: 0.287247
[70]	valid_0's binary_logloss: 0.2872
[71]	valid_0's binary_logloss: 0.287157
[72]	valid_0's binary_logloss: 0.287122
[73]	valid_0's binary_logloss: 0.287071
[74]	valid_0's binary_logloss: 0.287039
[75]	valid_0's binary_logloss: 0.286994
[76]	valid_0's binary_logloss: 0.286956
[77]	valid_0's binary_logloss: 0.286915
[78]	valid_0's binary_logloss: 0.286864
[79]	valid_0's binary_logloss: 0.286832
[80]	valid_0's binary_logloss: 0.286789
[81]	valid_0's binary_logloss: 0.286761
[82]	valid_0's binary_logloss: 0.286724
[83]	valid_0's binary_logloss: 0.28669
[84]	valid_0's binary_logloss: 0.286652
[85]	valid_0's binary_logloss: 0.286618
[86]	valid_0's binary_logloss: 0.286582
[87]	valid_0's binary_logloss: 0.286552
[88]	valid_0's binary_logloss: 0.286519
[89]	valid_0's binary_logloss: 0.286492
[90]	valid_0's binary_logloss: 0.286457
[91]	valid_0's binary_logloss: 0.286432
[92]	valid_0's binary_logloss: 0.286378
[93]	valid_0's binary_logloss: 0.286354
[94]	valid_0's binary_logloss: 0.286329
[95]	valid_0's binary_logloss: 0.286301
[96]	valid_0's binary_logloss: 0.286276
[97]	valid_0's binary_logloss: 0.28625
[98]	valid_0's binary_logloss: 0.286197
[99]	valid_0's binary_logloss: 0.286171
[100]	valid_0's binary_logloss: 0.286147
[101]	valid_0's binary_logloss: 0.286121
[102]	valid_0's binary_logloss: 0.286096
[103]	valid_0's binary_logloss: 0.286072
[104]	valid_0's binary_logloss: 0.286037
[105]	valid_0's binary_logloss: 0.286013
[106]	valid_0's binary_logloss: 0.285964
[107]	valid_0's binary_logloss: 0.285942
[108]	valid_0's binary_logloss: 0.28592
[109]	valid_0's binary_logloss: 0.285902
[110]	valid_0's binary_logloss: 0.285885
[111]	valid_0's binary_logloss: 0.285859
[112]	valid_0's binary_logloss: 0.285827
[113]	valid_0's binary_logloss: 0.285809
[114]	valid_0's binary_logloss: 0.285785
[115]	valid_0's binary_logloss: 0.285766
[116]	valid_0's binary_logloss: 0.285751
[117]	valid_0's binary_logloss: 0.285725
[118]	valid_0's binary_logloss: 0.285708
[119]	valid_0's binary_logloss: 0.285686
[120]	valid_0's binary_logloss: 0.285667
[121]	valid_0's binary_logloss: 0.285653
[122]	valid_0's binary_logloss: 0.285625
[123]	valid_0's binary_logloss: 0.285608
[124]	valid_0's binary_logloss: 0.285584
[125]	valid_0's binary_logloss: 0.285548
[126]	valid_0's binary_logloss: 0.285528
[127]	valid_0's binary_logloss: 0.285512
[128]	valid_0's binary_logloss: 0.285498
[129]	valid_0's binary_logloss: 0.285472
[130]	valid_0's binary_logloss: 0.285454
[131]	valid_0's binary_logloss: 0.285437
[132]	valid_0's binary_logloss: 0.285426
[133]	valid_0's binary_logloss: 0.285397
[134]	valid_0's binary_logloss: 0.28538
[135]	valid_0's binary_logloss: 0.285363
[136]	valid_0's binary_logloss: 0.285348
[137]	valid_0's binary_logloss: 0.285332
[138]	valid_0's binary_logloss: 0.285317
[139]	valid_0's binary_logloss: 0.285304
[140]	valid_0's binary_logloss: 0.285285
[141]	valid_0's binary_logloss: 0.285268
[142]	valid_0's binary_logloss: 0.285258
[143]	valid_0's binary_logloss: 0.285246
[144]	valid_0's binary_logloss: 0.285235
[145]	valid_0's binary_logloss: 0.285222
[146]	valid_0's binary_logloss: 0.285205
[147]	valid_0's binary_logloss: 0.285195
[148]	valid_0's binary_logloss: 0.28518
[149]	valid_0's binary_logloss: 0.285164
[150]	valid_0's binary_logloss: 0.285144
[151]	valid_0's binary_logloss: 0.285126
[152]	valid_0's binary_logloss: 0.285109
[153]	valid_0's binary_logloss: 0.285091
[154]	valid_0's binary_logloss: 0.285066
[155]	valid_0's binary_logloss: 0.28505
[156]	valid_0's binary_logloss: 0.28504
[157]	valid_0's binary_logloss: 0.285023
[158]	valid_0's binary_logloss: 0.28501
[159]	valid_0's binary_logloss: 0.285007
[160]	valid_0's binary_logloss: 0.284995
[161]	valid_0's binary_logloss: 0.284982
[162]	valid_0's binary_logloss: 0.284971
[163]	valid_0's binary_logloss: 0.28496
[164]	valid_0's binary_logloss: 0.284948
[165]	valid_0's binary_logloss: 0.284936
[166]	valid_0's binary_logloss: 0.284923
[167]	valid_0's binary_logloss: 0.284909
[168]	valid_0's binary_logloss: 0.284904
[169]	valid_0's binary_logloss: 0.284892
[170]	valid_0's binary_logloss: 0.284884
[171]	valid_0's binary_logloss: 0.284875
[172]	valid_0's binary_logloss: 0.284874
[173]	valid_0's binary_logloss: 0.284872
[174]	valid_0's binary_logloss: 0.284859
[175]	valid_0's binary_logloss: 0.284848
[176]	valid_0's binary_logloss: 0.284825
[177]	valid_0's binary_logloss: 0.28481
[178]	valid_0's binary_logloss: 0.284791
[179]	valid_0's binary_logloss: 0.28478
[180]	valid_0's binary_logloss: 0.28477
[181]	valid_0's binary_logloss: 0.284765
[182]	valid_0's binary_logloss: 0.284751
[183]	valid_0's binary_logloss: 0.28474
[184]	valid_0's binary_logloss: 0.284726
[185]	valid_0's binary_logloss: 0.284717
[186]	valid_0's binary_logloss: 0.284707
[187]	valid_0's binary_logloss: 0.284697
[188]	valid_0's binary_logloss: 0.284692
[189]	valid_0's binary_logloss: 0.284671
[190]	valid_0's binary_logloss: 0.284657
[191]	valid_0's binary_logloss: 0.284644
[192]	valid_0's binary_logloss: 0.284631
[193]	valid_0's binary_logloss: 0.284613
[194]	valid_0's binary_logloss: 0.284611
[195]	valid_0's binary_logloss: 0.284598
[196]	valid_0's binary_logloss: 0.284596
[197]	valid_0's binary_logloss: 0.284582
[198]	valid_0's binary_logloss: 0.284572
[199]	valid_0's binary_logloss: 0.284568
[200]	valid_0's binary_logloss: 0.284559
[201]	valid_0's binary_logloss: 0.284552
[202]	valid_0's binary_logloss: 0.284544
[203]	valid_0's binary_logloss: 0.284533
[204]	valid_0's binary_logloss: 0.284516
[205]	valid_0's binary_logloss: 0.284507
[206]	valid_0's binary_logloss: 0.284502
[207]	valid_0's binary_logloss: 0.284493
[208]	valid_0's binary_logloss: 0.284481
[209]	valid_0's binary_logloss: 0.284474
[210]	valid_0's binary_logloss: 0.284458
[211]	valid_0's binary_logloss: 0.284443
[212]	valid_0's binary_logloss: 0.284435
[213]	valid_0's binary_logloss: 0.284416
[214]	valid_0's binary_logloss: 0.284405
[215]	valid_0's binary_logloss: 0.284397
[216]	valid_0's binary_logloss: 0.284391
[217]	valid_0's binary_logloss: 0.28438
[218]	valid_0's binary_logloss: 0.284372
[219]	valid_0's binary_logloss: 0.284365
[220]	valid_0's binary_logloss: 0.28436
[221]	valid_0's binary_logloss: 0.28435
[222]	valid_0's binary_logloss: 0.284339
[223]	valid_0's binary_logloss: 0.28433
[224]	valid_0's binary_logloss: 0.284322
[225]	valid_0's binary_logloss: 0.284307
[226]	valid_0's binary_logloss: 0.284296
[227]	valid_0's binary_logloss: 0.284279
[228]	valid_0's binary_logloss: 0.284275
[229]	valid_0's binary_logloss: 0.284263
[230]	valid_0's binary_logloss: 0.284262
[231]	valid_0's binary_logloss: 0.284257
[232]	valid_0's binary_logloss: 0.284246
[233]	valid_0's binary_logloss: 0.284243
[234]	valid_0's binary_logloss: 0.284239
[235]	valid_0's binary_logloss: 0.284236
[236]	valid_0's binary_logloss: 0.284231
[237]	valid_0's binary_logloss: 0.284225
[238]	valid_0's binary_logloss: 0.284217
[239]	valid_0's binary_logloss: 0.284209
[240]	valid_0's binary_logloss: 0.284198
[241]	valid_0's binary_logloss: 0.284189
[242]	valid_0's binary_logloss: 0.284181
[243]	valid_0's binary_logloss: 0.284165
[244]	valid_0's binary_logloss: 0.284154
[245]	valid_0's binary_logloss: 0.284142
[246]	valid_0's binary_logloss: 0.284136
[247]	valid_0's binary_logloss: 0.284128
[248]	valid_0's binary_logloss: 0.284118
[249]	valid_0's binary_logloss: 0.284112
[250]	valid_0's binary_logloss: 0.284108
[251]	valid_0's binary_logloss: 0.284104
[252]	valid_0's binary_logloss: 0.284094
[253]	valid_0's binary_logloss: 0.284086
[254]	valid_0's binary_logloss: 0.284079
[255]	valid_0's binary_logloss: 0.284073
[256]	valid_0's binary_logloss: 0.284066
[257]	valid_0's binary_logloss: 0.28406
[258]	valid_0's binary_logloss: 0.284051
[259]	valid_0's binary_logloss: 0.284041
[260]	valid_0's binary_logloss: 0.284033
[261]	valid_0's binary_logloss: 0.284021
[262]	valid_0's binary_logloss: 0.284014
[263]	valid_0's binary_logloss: 0.284009
[264]	valid_0's binary_logloss: 0.283999
[265]	valid_0's binary_logloss: 0.283989
[266]	valid_0's binary_logloss: 0.283981
[267]	valid_0's binary_logloss: 0.283975
[268]	valid_0's binary_logloss: 0.283965
[269]	valid_0's binary_logloss: 0.28396
[270]	valid_0's binary_logloss: 0.283951
[271]	valid_0's binary_logloss: 0.283943
[272]	valid_0's binary_logloss: 0.283931
[273]	valid_0's binary_logloss: 0.283921
[274]	valid_0's binary_logloss: 0.283903
[275]	valid_0's binary_logloss: 0.283892
[276]	valid_0's binary_logloss: 0.283888
[277]	valid_0's binary_logloss: 0.283876
[278]	valid_0's binary_logloss: 0.283864
[279]	valid_0's binary_logloss: 0.283848
[280]	valid_0's binary_logloss: 0.283841
[281]	valid_0's binary_logloss: 0.28384
[282]	valid_0's binary_logloss: 0.283835
[283]	valid_0's binary_logloss: 0.283829
[284]	valid_0's binary_logloss: 0.283823
[285]	valid_0's binary_logloss: 0.283817
[286]	valid_0's binary_logloss: 0.283813
[287]	valid_0's binary_logloss: 0.283809
[288]	valid_0's binary_logloss: 0.283797
[289]	valid_0's binary_logloss: 0.28379
[290]	valid_0's binary_logloss: 0.283777
[291]	valid_0's binary_logloss: 0.283765
[292]	valid_0's binary_logloss: 0.283749
[293]	valid_0's binary_logloss: 0.283739
[294]	valid_0's binary_logloss: 0.283735
[295]	valid_0's binary_logloss: 0.283728
[296]	valid_0's binary_logloss: 0.283718
[297]	valid_0's binary_logloss: 0.283707
[298]	valid_0's binary_logloss: 0.283701
[299]	valid_0's binary_logloss: 0.283689
[300]	valid_0's binary_logloss: 0.283685
[301]	valid_0's binary_logloss: 0.283676
[302]	valid_0's binary_logloss: 0.283675
[303]	valid_0's binary_logloss: 0.283669
[304]	valid_0's binary_logloss: 0.283649
[305]	valid_0's binary_logloss: 0.283641
[306]	valid_0's binary_logloss: 0.283632
[307]	valid_0's binary_logloss: 0.283626
[308]	valid_0's binary_logloss: 0.283622
[309]	valid_0's binary_logloss: 0.283619
[310]	valid_0's binary_logloss: 0.283615
[311]	valid_0's binary_logloss: 0.28361
[312]	valid_0's binary_logloss: 0.283606
[313]	valid_0's binary_logloss: 0.283601
[314]	valid_0's binary_logloss: 0.283596
[315]	valid_0's binary_logloss: 0.283588
[316]	valid_0's binary_logloss: 0.283581
[317]	valid_0's binary_logloss: 0.283575
[318]	valid_0's binary_logloss: 0.283564
[319]	valid_0's binary_logloss: 0.28356
[320]	valid_0's binary_logloss: 0.283556
[321]	valid_0's binary_logloss: 0.283548
[322]	valid_0's binary_logloss: 0.283541
[323]	valid_0's binary_logloss: 0.283532
[324]	valid_0's binary_logloss: 0.283511
[325]	valid_0's binary_logloss: 0.283504
[326]	valid_0's binary_logloss: 0.283501
[327]	valid_0's binary_logloss: 0.283495
[328]	valid_0's binary_logloss: 0.283488
[329]	valid_0's binary_logloss: 0.283478
[330]	valid_0's binary_logloss: 0.283473
[331]	valid_0's binary_logloss: 0.283473
[332]	valid_0's binary_logloss: 0.283465
[333]	valid_0's binary_logloss: 0.283453
[334]	valid_0's binary_logloss: 0.283445
[335]	valid_0's binary_logloss: 0.283443
[336]	valid_0's binary_logloss: 0.283436
[337]	valid_0's binary_logloss: 0.283426
[338]	valid_0's binary_logloss: 0.28342
[339]	valid_0's binary_logloss: 0.283411
[340]	valid_0's binary_logloss: 0.283408
[341]	valid_0's binary_logloss: 0.283405
[342]	valid_0's binary_logloss: 0.283404
[343]	valid_0's binary_logloss: 0.283398
[344]	valid_0's binary_logloss: 0.283394
[345]	valid_0's binary_logloss: 0.283389
[346]	valid_0's binary_logloss: 0.283379
[347]	valid_0's binary_logloss: 0.28337
[348]	valid_0's binary_logloss: 0.283367
[349]	valid_0's binary_logloss: 0.283359
[350]	valid_0's binary_logloss: 0.283358
[351]	valid_0's binary_logloss: 0.283354
[352]	valid_0's binary_logloss: 0.283338
[353]	valid_0's binary_logloss: 0.283334
[354]	valid_0's binary_logloss: 0.283327
[355]	valid_0's binary_logloss: 0.283321
[356]	valid_0's binary_logloss: 0.28332
[357]	valid_0's binary_logloss: 0.283315
[358]	valid_0's binary_logloss: 0.283313
[359]	valid_0's binary_logloss: 0.283301
[360]	valid_0's binary_logloss: 0.283299
[361]	valid_0's binary_logloss: 0.283292
[362]	valid_0's binary_logloss: 0.283283
[363]	valid_0's binary_logloss: 0.283273
[364]	valid_0's binary_logloss: 0.283272
[365]	valid_0's binary_logloss: 0.283269
[366]	valid_0's binary_logloss: 0.283265
[367]	valid_0's binary_logloss: 0.283255
[368]	valid_0's binary_logloss: 0.283244
[369]	valid_0's binary_logloss: 0.283235
[370]	valid_0's binary_logloss: 0.283224
[371]	valid_0's binary_logloss: 0.283218
[372]	valid_0's binary_logloss: 0.283213
[373]	valid_0's binary_logloss: 0.283207
[374]	valid_0's binary_logloss: 0.283204
[375]	valid_0's binary_logloss: 0.283201
[376]	valid_0's binary_logloss: 0.283195
[377]	valid_0's binary_logloss: 0.283195
[378]	valid_0's binary_logloss: 0.283193
[379]	valid_0's binary_logloss: 0.28319
[380]	valid_0's binary_logloss: 0.283181
[381]	valid_0's binary_logloss: 0.28317
[382]	valid_0's binary_logloss: 0.283166
[383]	valid_0's binary_logloss: 0.283161
[384]	valid_0's binary_logloss: 0.283158
[385]	valid_0's binary_logloss: 0.283152
[386]	valid_0's binary_logloss: 0.283143
[387]	valid_0's binary_logloss: 0.283133
[388]	valid_0's binary_logloss: 0.283124
[389]	valid_0's binary_logloss: 0.283124
[390]	valid_0's binary_logloss: 0.283117
[391]	valid_0's binary_logloss: 0.283115
[392]	valid_0's binary_logloss: 0.283114
[393]	valid_0's binary_logloss: 0.28311
[394]	valid_0's binary_logloss: 0.283102
[395]	valid_0's binary_logloss: 0.283098
[396]	valid_0's binary_logloss: 0.283088
[397]	valid_0's binary_logloss: 0.283087
[398]	valid_0's binary_logloss: 0.283085
[399]	valid_0's binary_logloss: 0.283069
[400]	valid_0's binary_logloss: 0.283067
[401]	valid_0's binary_logloss: 0.283066
[402]	valid_0's binary_logloss: 0.283056
[403]	valid_0's binary_logloss: 0.283046
[404]	valid_0's binary_logloss: 0.283045
[405]	valid_0's binary_logloss: 0.283045
[406]	valid_0's binary_logloss: 0.283038
[407]	valid_0's binary_logloss: 0.283038
[408]	valid_0's binary_logloss: 0.283036
[409]	valid_0's binary_logloss: 0.28303
[410]	valid_0's binary_logloss: 0.283025
[411]	valid_0's binary_logloss: 0.283021
[412]	valid_0's binary_logloss: 0.283012
[413]	valid_0's binary_logloss: 0.28301
[414]	valid_0's binary_logloss: 0.283003
[415]	valid_0's binary_logloss: 0.282998
[416]	valid_0's binary_logloss: 0.282991
[417]	valid_0's binary_logloss: 0.282986
[418]	valid_0's binary_logloss: 0.282985
[419]	valid_0's binary_logloss: 0.282984
[420]	valid_0's binary_logloss: 0.282978
[421]	valid_0's binary_logloss: 0.282968
[422]	valid_0's binary_logloss: 0.282959
[423]	valid_0's binary_logloss: 0.282953
[424]	valid_0's binary_logloss: 0.28295
[425]	valid_0's binary_logloss: 0.282941
[426]	valid_0's binary_logloss: 0.282932
[427]	valid_0's binary_logloss: 0.282929
[428]	valid_0's binary_logloss: 0.282927
[429]	valid_0's binary_logloss: 0.282921
[430]	valid_0's binary_logloss: 0.28292
[431]	valid_0's binary_logloss: 0.282918
[432]	valid_0's binary_logloss: 0.282915
[433]	valid_0's binary_logloss: 0.282913
[434]	valid_0's binary_logloss: 0.282909
[435]	valid_0's binary_logloss: 0.282902
[436]	valid_0's binary_logloss: 0.282901
[437]	valid_0's binary_logloss: 0.2829
[438]	valid_0's binary_logloss: 0.282898
[439]	valid_0's binary_logloss: 0.282893
[440]	valid_0's binary_logloss: 0.282891
[441]	valid_0's binary_logloss: 0.282889
[442]	valid_0's binary_logloss: 0.282887
[443]	valid_0's binary_logloss: 0.282884
[444]	valid_0's binary_logloss: 0.282874
[445]	valid_0's binary_logloss: 0.28287
[446]	valid_0's binary_logloss: 0.282867
[447]	valid_0's binary_logloss: 0.282858
[448]	valid_0's binary_logloss: 0.282856
[449]	valid_0's binary_logloss: 0.282849
[450]	valid_0's binary_logloss: 0.282841
[451]	valid_0's binary_logloss: 0.282838
[452]	valid_0's binary_logloss: 0.282835
[453]	valid_0's binary_logloss: 0.282831
[454]	valid_0's binary_logloss: 0.282826
[455]	valid_0's binary_logloss: 0.282817
[456]	valid_0's binary_logloss: 0.282812
[457]	valid_0's binary_logloss: 0.282804
[458]	valid_0's binary_logloss: 0.282799
[459]	valid_0's binary_logloss: 0.282791
[460]	valid_0's binary_logloss: 0.282785
[461]	valid_0's binary_logloss: 0.282778
[462]	valid_0's binary_logloss: 0.28277
[463]	valid_0's binary_logloss: 0.28276
[464]	valid_0's binary_logloss: 0.282758
[465]	valid_0's binary_logloss: 0.282751
[466]	valid_0's binary_logloss: 0.282746
[467]	valid_0's binary_logloss: 0.282742
[468]	valid_0's binary_logloss: 0.282741
[469]	valid_0's binary_logloss: 0.282735
[470]	valid_0's binary_logloss: 0.282731
[471]	valid_0's binary_logloss: 0.282723
[472]	valid_0's binary_logloss: 0.28272
[473]	valid_0's binary_logloss: 0.282717
[474]	valid_0's binary_logloss: 0.282716
[475]	valid_0's binary_logloss: 0.282711
[476]	valid_0's binary_logloss: 0.282712
[477]	valid_0's binary_logloss: 0.282703
[478]	valid_0's binary_logloss: 0.282698
[479]	valid_0's binary_logloss: 0.282691
[480]	valid_0's binary_logloss: 0.282686
[481]	valid_0's binary_logloss: 0.282679
[482]	valid_0's binary_logloss: 0.282671
[483]	valid_0's binary_logloss: 0.282668
[484]	valid_0's binary_logloss: 0.282665
[485]	valid_0's binary_logloss: 0.282661
[486]	valid_0's binary_logloss: 0.282657
[487]	valid_0's binary_logloss: 0.282649
[488]	valid_0's binary_logloss: 0.282648
[489]	valid_0's binary_logloss: 0.282646
[490]	valid_0's binary_logloss: 0.282644
[491]	valid_0's binary_logloss: 0.28264
[492]	valid_0's binary_logloss: 0.282639
[493]	valid_0's binary_logloss: 0.282629
[494]	valid_0's binary_logloss: 0.282629
[495]	valid_0's binary_logloss: 0.282622
[496]	valid_0's binary_logloss: 0.282618
[497]	valid_0's binary_logloss: 0.282615
[498]	valid_0's binary_logloss: 0.282615
[499]	valid_0's binary_logloss: 0.282607
[500]	valid_0's binary_logloss: 0.282603

In [26]:
bst.save_model('../models/lightGBM_morefeatures_80_10.txt', num_iteration=bst.best_iteration)

It seems it doesn't add much more values.

Use specific product vector features


In [54]:
#df_train.drop(emb_cols, axis = 1, inplace = True)
df_train.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 8474661 entries, 0 to 8474660
Columns: 134 entries, order_id to prod2vec_99
dtypes: float32(108), int16(3), int32(3), int8(2), object(1), uint8(17)
memory usage: 3.8+ GB

In [55]:
gc.collect()


Out[55]:
7

In [53]:
prod2vec_cols = features[features.str.startswith("prod2vec")].tolist()
f_to_use = ['user_total_orders', 'user_total_items', 'total_distinct_items', 'dow',
       'user_average_days_between_orders', 'user_average_basket',
       'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio',
       'aisle_id', 'department_id', 'product_orders', 'product_reorders',
       'product_reorder_rate'] + dow_cols + daytime_cols + prod2vec_cols
print(f_to_use)


['user_total_orders', 'user_total_items', 'total_distinct_items', 'dow', 'user_average_days_between_orders', 'user_average_basket', 'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio', 'aisle_id', 'department_id', 'product_orders', 'product_reorders', 'product_reorder_rate', u'dow_0', u'dow_1', u'dow_2', u'dow_3', u'dow_4', u'dow_5', u'dow_6', u'daytime_sleeping', u'daytime_morning', u'daytime_noon', u'daytime_afternoon', u'daytime_evening', u'daytime_night', 'prod2vec_0', 'prod2vec_1', 'prod2vec_2', 'prod2vec_3', 'prod2vec_4', 'prod2vec_5', 'prod2vec_6', 'prod2vec_7', 'prod2vec_8', 'prod2vec_9', 'prod2vec_10', 'prod2vec_11', 'prod2vec_12', 'prod2vec_13', 'prod2vec_14', 'prod2vec_15', 'prod2vec_16', 'prod2vec_17', 'prod2vec_18', 'prod2vec_19', 'prod2vec_20', 'prod2vec_21', 'prod2vec_22', 'prod2vec_23', 'prod2vec_24', 'prod2vec_25', 'prod2vec_26', 'prod2vec_27', 'prod2vec_28', 'prod2vec_29', 'prod2vec_30', 'prod2vec_31', 'prod2vec_32', 'prod2vec_33', 'prod2vec_34', 'prod2vec_35', 'prod2vec_36', 'prod2vec_37', 'prod2vec_38', 'prod2vec_39', 'prod2vec_40', 'prod2vec_41', 'prod2vec_42', 'prod2vec_43', 'prod2vec_44', 'prod2vec_45', 'prod2vec_46', 'prod2vec_47', 'prod2vec_48', 'prod2vec_49', 'prod2vec_50', 'prod2vec_51', 'prod2vec_52', 'prod2vec_53', 'prod2vec_54', 'prod2vec_55', 'prod2vec_56', 'prod2vec_57', 'prod2vec_58', 'prod2vec_59', 'prod2vec_60', 'prod2vec_61', 'prod2vec_62', 'prod2vec_63', 'prod2vec_64', 'prod2vec_65', 'prod2vec_66', 'prod2vec_67', 'prod2vec_68', 'prod2vec_69', 'prod2vec_70', 'prod2vec_71', 'prod2vec_72', 'prod2vec_73', 'prod2vec_74', 'prod2vec_75', 'prod2vec_76', 'prod2vec_77', 'prod2vec_78', 'prod2vec_79', 'prod2vec_80', 'prod2vec_81', 'prod2vec_82', 'prod2vec_83', 'prod2vec_84', 'prod2vec_85', 'prod2vec_86', 'prod2vec_87', 'prod2vec_88', 'prod2vec_89', 'prod2vec_90', 'prod2vec_91', 'prod2vec_92', 'prod2vec_93', 'prod2vec_94', 'prod2vec_95', 'prod2vec_96', 'prod2vec_97', 'prod2vec_98', 'prod2vec_99']

In [56]:
print("split the train and validation set")
X_train, X_valid, y_train, y_valid = train_test_split(df_train[f_to_use], labels, test_size = 0.3, random_state=2017)


split the train and validation set

In [58]:
X_train = pd.DataFrame(X_train, columns = f_to_use)
X_valid = pd.DataFrame(X_valid, columns = f_to_use)

In [59]:
print('formating training and validation dataset for lgb')
d_train = lgb.Dataset(X_train,
                      label=y_train,
                      categorical_feature=['aisle_id', 'department_id'])  
d_valid = lgb.Dataset(X_valid,
                      label=y_valid,
                      categorical_feature=['aisle_id', 'department_id'])


formating training and validation dataset for lgb

In [60]:
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 100,
    'max_depth': 12,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.9,
    'bagging_freq': 8
}
ROUNDS = 500

In [61]:
gc.collect()


Out[61]:
19

In [62]:
print('Training light GBM ...')
bst_all = lgb.train(params, d_train, valid_sets= [d_valid], num_boost_round= ROUNDS, early_stopping_rounds = 10)


Training light GBM ...
[1]	valid_0's binary_logloss: 0.629861
Train until valid scores didn't improve in 10 rounds.
[2]	valid_0's binary_logloss: 0.578073
[3]	valid_0's binary_logloss: 0.535196
[4]	valid_0's binary_logloss: 0.499764
[5]	valid_0's binary_logloss: 0.469615
[6]	valid_0's binary_logloss: 0.44406
[7]	valid_0's binary_logloss: 0.422511
[8]	valid_0's binary_logloss: 0.403941
[9]	valid_0's binary_logloss: 0.388121
[10]	valid_0's binary_logloss: 0.374644
[11]	valid_0's binary_logloss: 0.363028
[12]	valid_0's binary_logloss: 0.352877
[13]	valid_0's binary_logloss: 0.34415
[14]	valid_0's binary_logloss: 0.336623
[15]	valid_0's binary_logloss: 0.33029
[16]	valid_0's binary_logloss: 0.324971
[17]	valid_0's binary_logloss: 0.320195
[18]	valid_0's binary_logloss: 0.316005
[19]	valid_0's binary_logloss: 0.312523
[20]	valid_0's binary_logloss: 0.309494
[21]	valid_0's binary_logloss: 0.306809
[22]	valid_0's binary_logloss: 0.304419
[23]	valid_0's binary_logloss: 0.302441
[24]	valid_0's binary_logloss: 0.300757
[25]	valid_0's binary_logloss: 0.299179
[26]	valid_0's binary_logloss: 0.297899
[27]	valid_0's binary_logloss: 0.296727
[28]	valid_0's binary_logloss: 0.29579
[29]	valid_0's binary_logloss: 0.294863
[30]	valid_0's binary_logloss: 0.294069
[31]	valid_0's binary_logloss: 0.293394
[32]	valid_0's binary_logloss: 0.292806
[33]	valid_0's binary_logloss: 0.292265
[34]	valid_0's binary_logloss: 0.291791
[35]	valid_0's binary_logloss: 0.291395
[36]	valid_0's binary_logloss: 0.291043
[37]	valid_0's binary_logloss: 0.290739
[38]	valid_0's binary_logloss: 0.290433
[39]	valid_0's binary_logloss: 0.290177
[40]	valid_0's binary_logloss: 0.289928
[41]	valid_0's binary_logloss: 0.289729
[42]	valid_0's binary_logloss: 0.289542
[43]	valid_0's binary_logloss: 0.289365
[44]	valid_0's binary_logloss: 0.289221
[45]	valid_0's binary_logloss: 0.289092
[46]	valid_0's binary_logloss: 0.288908
[47]	valid_0's binary_logloss: 0.288806
[48]	valid_0's binary_logloss: 0.288661
[49]	valid_0's binary_logloss: 0.288556
[50]	valid_0's binary_logloss: 0.288456
[51]	valid_0's binary_logloss: 0.288328
[52]	valid_0's binary_logloss: 0.28824
[53]	valid_0's binary_logloss: 0.288158
[54]	valid_0's binary_logloss: 0.288058
[55]	valid_0's binary_logloss: 0.28798
[56]	valid_0's binary_logloss: 0.28791
[57]	valid_0's binary_logloss: 0.287823
[58]	valid_0's binary_logloss: 0.287764
[59]	valid_0's binary_logloss: 0.287714
[60]	valid_0's binary_logloss: 0.287635
[61]	valid_0's binary_logloss: 0.287587
[62]	valid_0's binary_logloss: 0.287539
[63]	valid_0's binary_logloss: 0.287472
[64]	valid_0's binary_logloss: 0.287433
[65]	valid_0's binary_logloss: 0.287384
[66]	valid_0's binary_logloss: 0.287343
[67]	valid_0's binary_logloss: 0.287288
[68]	valid_0's binary_logloss: 0.28724
[69]	valid_0's binary_logloss: 0.287203
[70]	valid_0's binary_logloss: 0.287176
[71]	valid_0's binary_logloss: 0.287139
[72]	valid_0's binary_logloss: 0.287109
[73]	valid_0's binary_logloss: 0.287065
[74]	valid_0's binary_logloss: 0.287024
[75]	valid_0's binary_logloss: 0.286987
[76]	valid_0's binary_logloss: 0.286949
[77]	valid_0's binary_logloss: 0.286924
[78]	valid_0's binary_logloss: 0.286893
[79]	valid_0's binary_logloss: 0.286873
[80]	valid_0's binary_logloss: 0.286843
[81]	valid_0's binary_logloss: 0.286813
[82]	valid_0's binary_logloss: 0.286789
[83]	valid_0's binary_logloss: 0.286766
[84]	valid_0's binary_logloss: 0.286746
[85]	valid_0's binary_logloss: 0.286729
[86]	valid_0's binary_logloss: 0.286711
[87]	valid_0's binary_logloss: 0.286687
[88]	valid_0's binary_logloss: 0.28667
[89]	valid_0's binary_logloss: 0.286649
[90]	valid_0's binary_logloss: 0.286628
[91]	valid_0's binary_logloss: 0.286603
[92]	valid_0's binary_logloss: 0.286586
[93]	valid_0's binary_logloss: 0.28657
[94]	valid_0's binary_logloss: 0.286553
[95]	valid_0's binary_logloss: 0.286538
[96]	valid_0's binary_logloss: 0.28652
[97]	valid_0's binary_logloss: 0.286501
[98]	valid_0's binary_logloss: 0.286486
[99]	valid_0's binary_logloss: 0.286475
[100]	valid_0's binary_logloss: 0.286457
[101]	valid_0's binary_logloss: 0.286446
[102]	valid_0's binary_logloss: 0.286432
[103]	valid_0's binary_logloss: 0.286413
[104]	valid_0's binary_logloss: 0.286396
[105]	valid_0's binary_logloss: 0.28638
[106]	valid_0's binary_logloss: 0.286367
[107]	valid_0's binary_logloss: 0.286351
[108]	valid_0's binary_logloss: 0.28634
[109]	valid_0's binary_logloss: 0.286331
[110]	valid_0's binary_logloss: 0.286322
[111]	valid_0's binary_logloss: 0.286311
[112]	valid_0's binary_logloss: 0.286302
[113]	valid_0's binary_logloss: 0.286293
[114]	valid_0's binary_logloss: 0.28628
[115]	valid_0's binary_logloss: 0.286269
[116]	valid_0's binary_logloss: 0.286259
[117]	valid_0's binary_logloss: 0.286247
[118]	valid_0's binary_logloss: 0.286237
[119]	valid_0's binary_logloss: 0.286227
[120]	valid_0's binary_logloss: 0.286214
[121]	valid_0's binary_logloss: 0.28621
[122]	valid_0's binary_logloss: 0.286202
[123]	valid_0's binary_logloss: 0.286194
[124]	valid_0's binary_logloss: 0.286187
[125]	valid_0's binary_logloss: 0.286179
[126]	valid_0's binary_logloss: 0.286174
[127]	valid_0's binary_logloss: 0.286172
[128]	valid_0's binary_logloss: 0.286157
[129]	valid_0's binary_logloss: 0.286154
[130]	valid_0's binary_logloss: 0.286149
[131]	valid_0's binary_logloss: 0.286141
[132]	valid_0's binary_logloss: 0.286134
[133]	valid_0's binary_logloss: 0.286127
[134]	valid_0's binary_logloss: 0.286123
[135]	valid_0's binary_logloss: 0.286118
[136]	valid_0's binary_logloss: 0.286112
[137]	valid_0's binary_logloss: 0.286102
[138]	valid_0's binary_logloss: 0.286093
[139]	valid_0's binary_logloss: 0.286086
[140]	valid_0's binary_logloss: 0.286077
[141]	valid_0's binary_logloss: 0.286072
[142]	valid_0's binary_logloss: 0.286068
[143]	valid_0's binary_logloss: 0.286064
[144]	valid_0's binary_logloss: 0.28606
[145]	valid_0's binary_logloss: 0.286051
[146]	valid_0's binary_logloss: 0.286042
[147]	valid_0's binary_logloss: 0.286035
[148]	valid_0's binary_logloss: 0.28603
[149]	valid_0's binary_logloss: 0.286023
[150]	valid_0's binary_logloss: 0.286019
[151]	valid_0's binary_logloss: 0.286015
[152]	valid_0's binary_logloss: 0.286007
[153]	valid_0's binary_logloss: 0.286003
[154]	valid_0's binary_logloss: 0.286
[155]	valid_0's binary_logloss: 0.285995
[156]	valid_0's binary_logloss: 0.285988
[157]	valid_0's binary_logloss: 0.285988
[158]	valid_0's binary_logloss: 0.285984
[159]	valid_0's binary_logloss: 0.285979
[160]	valid_0's binary_logloss: 0.285972
[161]	valid_0's binary_logloss: 0.285966
[162]	valid_0's binary_logloss: 0.285961
[163]	valid_0's binary_logloss: 0.285957
[164]	valid_0's binary_logloss: 0.285952
[165]	valid_0's binary_logloss: 0.285944
[166]	valid_0's binary_logloss: 0.285941
[167]	valid_0's binary_logloss: 0.285935
[168]	valid_0's binary_logloss: 0.285933
[169]	valid_0's binary_logloss: 0.285928
[170]	valid_0's binary_logloss: 0.285926
[171]	valid_0's binary_logloss: 0.285919
[172]	valid_0's binary_logloss: 0.285917
[173]	valid_0's binary_logloss: 0.285913
[174]	valid_0's binary_logloss: 0.285906
[175]	valid_0's binary_logloss: 0.285899
[176]	valid_0's binary_logloss: 0.28589
[177]	valid_0's binary_logloss: 0.285889
[178]	valid_0's binary_logloss: 0.285888
[179]	valid_0's binary_logloss: 0.285887
[180]	valid_0's binary_logloss: 0.285885
[181]	valid_0's binary_logloss: 0.285882
[182]	valid_0's binary_logloss: 0.285882
[183]	valid_0's binary_logloss: 0.28588
[184]	valid_0's binary_logloss: 0.285875
[185]	valid_0's binary_logloss: 0.28587
[186]	valid_0's binary_logloss: 0.285871
[187]	valid_0's binary_logloss: 0.285867
[188]	valid_0's binary_logloss: 0.285862
[189]	valid_0's binary_logloss: 0.28585
[190]	valid_0's binary_logloss: 0.285851
[191]	valid_0's binary_logloss: 0.285848
[192]	valid_0's binary_logloss: 0.285845
[193]	valid_0's binary_logloss: 0.285844
[194]	valid_0's binary_logloss: 0.285841
[195]	valid_0's binary_logloss: 0.285839
[196]	valid_0's binary_logloss: 0.285837
[197]	valid_0's binary_logloss: 0.285833
[198]	valid_0's binary_logloss: 0.285832
[199]	valid_0's binary_logloss: 0.285829
[200]	valid_0's binary_logloss: 0.285825
[201]	valid_0's binary_logloss: 0.285822
[202]	valid_0's binary_logloss: 0.285819
[203]	valid_0's binary_logloss: 0.285818
[204]	valid_0's binary_logloss: 0.285817
[205]	valid_0's binary_logloss: 0.285816
[206]	valid_0's binary_logloss: 0.285813
[207]	valid_0's binary_logloss: 0.285811
[208]	valid_0's binary_logloss: 0.285811
[209]	valid_0's binary_logloss: 0.285807
[210]	valid_0's binary_logloss: 0.285803
[211]	valid_0's binary_logloss: 0.285803
[212]	valid_0's binary_logloss: 0.285801
[213]	valid_0's binary_logloss: 0.2858
[214]	valid_0's binary_logloss: 0.2858
[215]	valid_0's binary_logloss: 0.285801
[216]	valid_0's binary_logloss: 0.285796
[217]	valid_0's binary_logloss: 0.285792
[218]	valid_0's binary_logloss: 0.285792
[219]	valid_0's binary_logloss: 0.285793
[220]	valid_0's binary_logloss: 0.285786
[221]	valid_0's binary_logloss: 0.285783
[222]	valid_0's binary_logloss: 0.285781
[223]	valid_0's binary_logloss: 0.285779
[224]	valid_0's binary_logloss: 0.285777
[225]	valid_0's binary_logloss: 0.285774
[226]	valid_0's binary_logloss: 0.285771
[227]	valid_0's binary_logloss: 0.28577
[228]	valid_0's binary_logloss: 0.285769
[229]	valid_0's binary_logloss: 0.285768
[230]	valid_0's binary_logloss: 0.285769
[231]	valid_0's binary_logloss: 0.285764
[232]	valid_0's binary_logloss: 0.285762
[233]	valid_0's binary_logloss: 0.28576
[234]	valid_0's binary_logloss: 0.28576
[235]	valid_0's binary_logloss: 0.285761
[236]	valid_0's binary_logloss: 0.285757
[237]	valid_0's binary_logloss: 0.285758
[238]	valid_0's binary_logloss: 0.285757
[239]	valid_0's binary_logloss: 0.285751
[240]	valid_0's binary_logloss: 0.285751
[241]	valid_0's binary_logloss: 0.285748
[242]	valid_0's binary_logloss: 0.285748
[243]	valid_0's binary_logloss: 0.285747
[244]	valid_0's binary_logloss: 0.285745
[245]	valid_0's binary_logloss: 0.285741
[246]	valid_0's binary_logloss: 0.285741
[247]	valid_0's binary_logloss: 0.285739
[248]	valid_0's binary_logloss: 0.285737
[249]	valid_0's binary_logloss: 0.285733
[250]	valid_0's binary_logloss: 0.285732
[251]	valid_0's binary_logloss: 0.28573
[252]	valid_0's binary_logloss: 0.285724
[253]	valid_0's binary_logloss: 0.285725
[254]	valid_0's binary_logloss: 0.285727
[255]	valid_0's binary_logloss: 0.285725
[256]	valid_0's binary_logloss: 0.285722
[257]	valid_0's binary_logloss: 0.28572
[258]	valid_0's binary_logloss: 0.285718
[259]	valid_0's binary_logloss: 0.285717
[260]	valid_0's binary_logloss: 0.285716
[261]	valid_0's binary_logloss: 0.285715
[262]	valid_0's binary_logloss: 0.285713
[263]	valid_0's binary_logloss: 0.285709
[264]	valid_0's binary_logloss: 0.285708
[265]	valid_0's binary_logloss: 0.285707
[266]	valid_0's binary_logloss: 0.285706
[267]	valid_0's binary_logloss: 0.285704
[268]	valid_0's binary_logloss: 0.285702
[269]	valid_0's binary_logloss: 0.285702
[270]	valid_0's binary_logloss: 0.285702
[271]	valid_0's binary_logloss: 0.285703
[272]	valid_0's binary_logloss: 0.285702
[273]	valid_0's binary_logloss: 0.285704
[274]	valid_0's binary_logloss: 0.285702
[275]	valid_0's binary_logloss: 0.285697
[276]	valid_0's binary_logloss: 0.285693
[277]	valid_0's binary_logloss: 0.28569
[278]	valid_0's binary_logloss: 0.285686
[279]	valid_0's binary_logloss: 0.285683
[280]	valid_0's binary_logloss: 0.285683
[281]	valid_0's binary_logloss: 0.285684
[282]	valid_0's binary_logloss: 0.285674
[283]	valid_0's binary_logloss: 0.285672
[284]	valid_0's binary_logloss: 0.285671
[285]	valid_0's binary_logloss: 0.285669
[286]	valid_0's binary_logloss: 0.285666
[287]	valid_0's binary_logloss: 0.285667
[288]	valid_0's binary_logloss: 0.285666
[289]	valid_0's binary_logloss: 0.285665
[290]	valid_0's binary_logloss: 0.285665
[291]	valid_0's binary_logloss: 0.285663
[292]	valid_0's binary_logloss: 0.285663
[293]	valid_0's binary_logloss: 0.285662
[294]	valid_0's binary_logloss: 0.285661
[295]	valid_0's binary_logloss: 0.28566
[296]	valid_0's binary_logloss: 0.285662
[297]	valid_0's binary_logloss: 0.285659
[298]	valid_0's binary_logloss: 0.285658
[299]	valid_0's binary_logloss: 0.285655
[300]	valid_0's binary_logloss: 0.285653
[301]	valid_0's binary_logloss: 0.28565
[302]	valid_0's binary_logloss: 0.28565
[303]	valid_0's binary_logloss: 0.285647
[304]	valid_0's binary_logloss: 0.285646
[305]	valid_0's binary_logloss: 0.285647
[306]	valid_0's binary_logloss: 0.285647
[307]	valid_0's binary_logloss: 0.285647
[308]	valid_0's binary_logloss: 0.285646
[309]	valid_0's binary_logloss: 0.285644
[310]	valid_0's binary_logloss: 0.285643
[311]	valid_0's binary_logloss: 0.285642
[312]	valid_0's binary_logloss: 0.28564
[313]	valid_0's binary_logloss: 0.285637
[314]	valid_0's binary_logloss: 0.285636
[315]	valid_0's binary_logloss: 0.285631
[316]	valid_0's binary_logloss: 0.285631
[317]	valid_0's binary_logloss: 0.285631
[318]	valid_0's binary_logloss: 0.285629
[319]	valid_0's binary_logloss: 0.285626
[320]	valid_0's binary_logloss: 0.285624
[321]	valid_0's binary_logloss: 0.285625
[322]	valid_0's binary_logloss: 0.285622
[323]	valid_0's binary_logloss: 0.285621
[324]	valid_0's binary_logloss: 0.285621
[325]	valid_0's binary_logloss: 0.28562
[326]	valid_0's binary_logloss: 0.285619
[327]	valid_0's binary_logloss: 0.285618
[328]	valid_0's binary_logloss: 0.285618
[329]	valid_0's binary_logloss: 0.285618
[330]	valid_0's binary_logloss: 0.285617
[331]	valid_0's binary_logloss: 0.285615
[332]	valid_0's binary_logloss: 0.285615
[333]	valid_0's binary_logloss: 0.285614
[334]	valid_0's binary_logloss: 0.285613
[335]	valid_0's binary_logloss: 0.285612
[336]	valid_0's binary_logloss: 0.28561
[337]	valid_0's binary_logloss: 0.285609
[338]	valid_0's binary_logloss: 0.285606
[339]	valid_0's binary_logloss: 0.285605
[340]	valid_0's binary_logloss: 0.285602
[341]	valid_0's binary_logloss: 0.2856
[342]	valid_0's binary_logloss: 0.285598
[343]	valid_0's binary_logloss: 0.285596
[344]	valid_0's binary_logloss: 0.285597
[345]	valid_0's binary_logloss: 0.285594
[346]	valid_0's binary_logloss: 0.285594
[347]	valid_0's binary_logloss: 0.285594
[348]	valid_0's binary_logloss: 0.285593
[349]	valid_0's binary_logloss: 0.285593
[350]	valid_0's binary_logloss: 0.285588
[351]	valid_0's binary_logloss: 0.285589
[352]	valid_0's binary_logloss: 0.28559
[353]	valid_0's binary_logloss: 0.285588
[354]	valid_0's binary_logloss: 0.285587
[355]	valid_0's binary_logloss: 0.285589
[356]	valid_0's binary_logloss: 0.285589
[357]	valid_0's binary_logloss: 0.285583
[358]	valid_0's binary_logloss: 0.285583
[359]	valid_0's binary_logloss: 0.285581
[360]	valid_0's binary_logloss: 0.285581
[361]	valid_0's binary_logloss: 0.285579
[362]	valid_0's binary_logloss: 0.285579
[363]	valid_0's binary_logloss: 0.285579
[364]	valid_0's binary_logloss: 0.285577
[365]	valid_0's binary_logloss: 0.285578
[366]	valid_0's binary_logloss: 0.285576
[367]	valid_0's binary_logloss: 0.285576
[368]	valid_0's binary_logloss: 0.285571
[369]	valid_0's binary_logloss: 0.285572
[370]	valid_0's binary_logloss: 0.285572
[371]	valid_0's binary_logloss: 0.285566
[372]	valid_0's binary_logloss: 0.285566
[373]	valid_0's binary_logloss: 0.285564
[374]	valid_0's binary_logloss: 0.285565
[375]	valid_0's binary_logloss: 0.285562
[376]	valid_0's binary_logloss: 0.285559
[377]	valid_0's binary_logloss: 0.285557
[378]	valid_0's binary_logloss: 0.285557
[379]	valid_0's binary_logloss: 0.285557
[380]	valid_0's binary_logloss: 0.285556
[381]	valid_0's binary_logloss: 0.285557
[382]	valid_0's binary_logloss: 0.285557
[383]	valid_0's binary_logloss: 0.285555
[384]	valid_0's binary_logloss: 0.285554
[385]	valid_0's binary_logloss: 0.285547
[386]	valid_0's binary_logloss: 0.285547
[387]	valid_0's binary_logloss: 0.285547
[388]	valid_0's binary_logloss: 0.285546
[389]	valid_0's binary_logloss: 0.285544
[390]	valid_0's binary_logloss: 0.285544
[391]	valid_0's binary_logloss: 0.285544
[392]	valid_0's binary_logloss: 0.285545
[393]	valid_0's binary_logloss: 0.285547
[394]	valid_0's binary_logloss: 0.285546
[395]	valid_0's binary_logloss: 0.285546
[396]	valid_0's binary_logloss: 0.285544
[397]	valid_0's binary_logloss: 0.285543
[398]	valid_0's binary_logloss: 0.285541
[399]	valid_0's binary_logloss: 0.285541
[400]	valid_0's binary_logloss: 0.285542
[401]	valid_0's binary_logloss: 0.285541
[402]	valid_0's binary_logloss: 0.285541
[403]	valid_0's binary_logloss: 0.285541
[404]	valid_0's binary_logloss: 0.28554
[405]	valid_0's binary_logloss: 0.285539
[406]	valid_0's binary_logloss: 0.285539
[407]	valid_0's binary_logloss: 0.285539
[408]	valid_0's binary_logloss: 0.28554
[409]	valid_0's binary_logloss: 0.285539
[410]	valid_0's binary_logloss: 0.28554
[411]	valid_0's binary_logloss: 0.28554
[412]	valid_0's binary_logloss: 0.285538
[413]	valid_0's binary_logloss: 0.285535
[414]	valid_0's binary_logloss: 0.285536
[415]	valid_0's binary_logloss: 0.285536
[416]	valid_0's binary_logloss: 0.285537
[417]	valid_0's binary_logloss: 0.285535
[418]	valid_0's binary_logloss: 0.285535
[419]	valid_0's binary_logloss: 0.285535
[420]	valid_0's binary_logloss: 0.285532
[421]	valid_0's binary_logloss: 0.285529
[422]	valid_0's binary_logloss: 0.285526
[423]	valid_0's binary_logloss: 0.285528
[424]	valid_0's binary_logloss: 0.28553
[425]	valid_0's binary_logloss: 0.285531
[426]	valid_0's binary_logloss: 0.285532
[427]	valid_0's binary_logloss: 0.28553
[428]	valid_0's binary_logloss: 0.285529
[429]	valid_0's binary_logloss: 0.28553
[430]	valid_0's binary_logloss: 0.285531
[431]	valid_0's binary_logloss: 0.28553
[432]	valid_0's binary_logloss: 0.28553
Early stopping, best iteration is:
[422]	valid_0's binary_logloss: 0.285526

In [63]:
bst_all.save_model('../models/lightGBM_prodfeats_100_12.txt', num_iteration=bst_all.best_iteration)

Choose the second model for predictions


In [36]:
### build candidates list for test ###
df_test, _ = features(test_orders)


build candidate list
('order row', 10000)
('order row', 20000)
('order row', 30000)
('order row', 40000)
('order row', 50000)
('order row', 60000)
('order row', 70000)
user related features
order related features
product related features
order_id                              int32
product_id                            int32
user_id                               int32
user_total_orders                     int16
user_total_items                      int16
total_distinct_items                  int16
user_average_days_between_orders    float32
user_average_basket                 float32
dow                                    int8
order_hour_of_day                      int8
days_since_prior_order              float32
days_since_ratio                    float32
aisle_id                              uint8
department_id                         uint8
product_orders                        int32
product_reorders                    float32
product_reorder_rate                float32
dtype: object
Index                                     72
order_id                            19333168
product_id                          19333168
user_id                             19333168
user_total_orders                    9666584
user_total_items                     9666584
total_distinct_items                 9666584
user_average_days_between_orders    19333168
user_average_basket                 19333168
dow                                  4833292
order_hour_of_day                    4833292
days_since_prior_order              19333168
days_since_ratio                    19333168
aisle_id                             4833292
department_id                        4833292
product_orders                      19333168
product_reorders                    19333168
product_reorder_rate                19333168
dtype: int64

In [38]:
# all user feature(others) 
all_users_features_df = pd.read_pickle("../data/processed/cleaned_all_users_features.pickle")
cols = all_users_features_df.columns
dow_cols = cols[cols.str.startswith('dow_')].tolist() + cols[cols.str.startswith('daytime_')].tolist()
most_cols = cols[cols.str.startswith('most_')].tolist()
top_cols = cols[cols.str.startswith('top')].tolist()
emb_cols = cols[cols.str.startswith('emb_')].tolist()
print("join with the user features")
to_join = ["user_id", 'user_avg_reordered', 'user_perc_reordered'] + most_cols + dow_cols + emb_cols + top_cols
df_test = pd.merge(df_test, all_users_features_df[to_join], on ="user_id")


join with the user features

In [41]:
df_test.shape


Out[41]:
(4833292, 137)

In [42]:
del all_users_features_df, test_orders
gc.collect()


Out[42]:
262

In [43]:
len(f_to_use)


Out[43]:
132

In [45]:
# load the model
bst_best = lgb.Booster(model_file='../models/lightGBM_morefeatures_80_10.txt')

In [47]:
print('light GBM predict')
preds = bst_best.predict(df_test[f_to_use])


light GBM predict

In [48]:
df_test['pred'] = preds

In [50]:
def generate_submission(df_test, test_orders_ids, file_name, threshold = 0.2, single_thres = True):
    """function to generate label predictions submission format"""
    if single_thres:
        TRESHOLD = threshold

        d = dict()
        for row in df_test.itertuples():
            if row.pred > TRESHOLD:
                try:
                    d[row.order_id] += ' ' + str(row.product_id)
                except:
                    d[row.order_id] = str(row.product_id)

        for order in test_orders_ids:
            if order not in d:
                d[order] = 'None'

        sub = pd.DataFrame.from_dict(d, orient='index')
        sub.reset_index(inplace=True)
        sub.columns = ['order_id', 'products']
        sub.to_csv(file_name, index=False)
    else:
        pass

In [52]:
test_order_ids = orders[orders.eval_set == 'test'].order_id

In [53]:
len(test_order_ids)


Out[53]:
75000

In [59]:
df_test[['order_id', 'pred', 'product_id']].to_csv("../data/processed/lightGBM_morefeatures_prob_preds.csv",index=False)

In [60]:
generate_submission(df_test, test_orders_ids, '../models/lightGBM_morefeatures_preds_20%thr.csv', threshold = 0.2)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-60-6930c0b68f76> in <module>()
----> 1 generate_submission(df_test, test_orders_ids, '../models/lightGBM_morefeatures_preds_20%thr.csv', threshold = 0.2)

NameError: name 'test_orders_ids' is not defined

In [ ]: